Code
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"Research Question: What does the job market look like for Business Analytics, Data Science, and ML professionals in 2024?
Narrative Flow: 1. Who’s hiring? → Top companies and industries 2. What roles exist? → Job titles within our occupation categories 3. How long do postings stay open? → Duration analysis by occupation 4. What do they want? → Skills in demand 5. What drives salary? → Key factors affecting compensation
Each insight builds toward our ML modeling decisions.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
pd.set_option('display.max_columns', None)
# Load data
df = pd.read_csv('data/lightcast_cleaned.csv')
df['POSTED'] = pd.to_datetime(df['POSTED'])
# Exclude unclassified/unknown values for cleaner analysis
df_clean = df[~df['NAICS_2022_2_NAME'].str.contains('Unclassified', na=False)].copy()
print(f"Dataset: {len(df):,} job postings")
print(f"Date range: {df['POSTED'].min().strftime('%b %Y')} - {df['POSTED'].max().strftime('%b %Y')}")
print(f"Occupations: {df['LOT_V6_OCCUPATION_NAME'].nunique()}")Dataset: 56,773 job postings
Date range: May 2024 - Sep 2024
Occupations: 4
# Top 10 Industries (excluding unclassified)
industry_counts = df_clean['NAICS_2022_2_NAME'].value_counts().head(10)
# Top 10 Companies
company_counts = df['COMPANY_NAME'].value_counts().head(10)
# Create side-by-side subplots
fig = go.Figure()
fig.add_trace(go.Bar(
y=industry_counts.index,
x=industry_counts.values,
orientation='h',
marker_color='steelblue',
name='Industries'
))
fig.update_layout(
title='Top 10 Industries Hiring Data Professionals',
xaxis_title='Number of Job Postings',
yaxis={'categoryorder': 'total ascending'},
template='plotly_white',
height=450,
showlegend=False
)
fig.write_image('figures/top_industries.png', scale=2)
fig.show()
# Print top companies
print("\nTop 10 Companies by Job Postings:")
for i, (company, count) in enumerate(company_counts.items(), 1):
print(f" {i}. {company}: {count:,} postings")
Top 10 Companies by Job Postings:
1. Unclassified: 3,025 postings
2. Deloitte: 2,352 postings
3. Accenture: 1,318 postings
4. PricewaterhouseCoopers: 697 postings
5. Merit America: 444 postings
6. Insight Global: 365 postings
7. Cardinal Health: 346 postings
8. Chewy: 319 postings
9. Smx Corporation Limited: 317 postings
10. Robert Half: 313 postings
# Top job titles within each occupation
fig = go.Figure()
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
occupations = df['LOT_V6_OCCUPATION_NAME'].unique()
for i, occ in enumerate(occupations):
df_occ = df[df['LOT_V6_OCCUPATION_NAME'] == occ]
top_titles = df_occ['TITLE_NAME'].value_counts().head(5)
fig.add_trace(go.Bar(
name=occ,
y=[f"{occ[:15]}... - {title[:25]}" for title in top_titles.index],
x=top_titles.values,
orientation='h',
marker_color=colors[i]
))
fig.update_layout(
title='Top 5 Job Titles per Occupation Category',
xaxis_title='Number of Postings',
yaxis={'categoryorder': 'total ascending'},
template='plotly_white',
height=600,
showlegend=True,
legend_title_text='Occupation',
barmode='stack'
)
fig.write_image('figures/job_titles_by_occupation.png', scale=2)
fig.show()
# Summary counts
print("\nPostings per Occupation:")
for occ, count in df['LOT_V6_OCCUPATION_NAME'].value_counts().items():
print(f" • {occ}: {count:,} ({count/len(df)*100:.1f}%)")
Postings per Occupation:
• Data / Data Mining Analyst: 26,718 (47.1%)
• Business Intelligence Analyst: 26,266 (46.3%)
• Business / Management Analyst: 3,657 (6.4%)
• Market Research Analyst: 132 (0.2%)
# Duration by Occupation
fig = px.box(
df,
x='LOT_V6_OCCUPATION_NAME',
y='DURATION',
color='LOT_V6_OCCUPATION_NAME',
title='Job Posting Duration by Occupation Type',
labels={'LOT_V6_OCCUPATION_NAME': '', 'DURATION': 'Days Posted'}
)
fig.update_layout(
template='plotly_white',
showlegend=False,
height=450,
xaxis_tickangle=-15
)
fig.write_image('figures/duration_by_occupation.png', scale=2)
fig.show()
# Summary stats
print("\nDuration Statistics by Occupation (days):")
duration_stats = df.groupby('LOT_V6_OCCUPATION_NAME')['DURATION'].agg(['median', 'mean', 'std']).round(1)
duration_stats.columns = ['Median', 'Mean', 'Std Dev']
print(duration_stats.sort_values('Median'))
# Overall
print(f"\nOverall median duration: {df['DURATION'].median():.0f} days")
Duration Statistics by Occupation (days):
Median Mean Std Dev
LOT_V6_OCCUPATION_NAME
Business Intelligence Analyst 30.0 34.9 24.1
Data / Data Mining Analyst 30.0 34.8 23.7
Business / Management Analyst 33.0 38.1 23.3
Market Research Analyst 34.0 37.4 23.6
Overall median duration: 30 days
# Extract skills
def extract_skills(skills_series):
all_skills = []
for skills in skills_series.dropna():
if isinstance(skills, str) and skills not in ['Not Listed', '']:
all_skills.extend([s.strip() for s in skills.split(',')])
return Counter(all_skills)
# General skills
skill_counts = extract_skills(df['SKILLS_NAME'])
top_skills = pd.DataFrame(skill_counts.most_common(15), columns=['Skill', 'Count'])
top_skills['Percentage'] = (top_skills['Count'] / len(df) * 100).round(1)
# Software skills
software_counts = extract_skills(df['SOFTWARE_SKILLS_NAME'])
top_software = pd.DataFrame(software_counts.most_common(10), columns=['Software', 'Count'])
top_software['Percentage'] = (top_software['Count'] / len(df) * 100).round(1)
# Combined visualization
fig = go.Figure()
fig.add_trace(go.Bar(
y=top_skills['Skill'],
x=top_skills['Percentage'],
orientation='h',
marker_color='#2E86AB',
name='General Skills'
))
fig.update_layout(
title='Top 15 Skills in Demand (% of All Job Postings)',
xaxis_title='% of Postings Requiring This Skill',
yaxis={'categoryorder': 'total ascending'},
template='plotly_white',
height=500,
showlegend=False
)
fig.write_image('figures/top_skills.png', scale=2)
fig.show()
# Print top software
print("\nTop 10 Software/Technical Skills:")
for i, row in top_software.iterrows():
print(f" {i+1}. {row['Software']}: {row['Percentage']}%")
Top 10 Software/Technical Skills:
1. SQL (Programming Language): 32.5%
2. Microsoft Excel: 19.7%
3. SAP Applications: 18.8%
4. Tableau (Business Intelligence Software): 18.3%
5. Dashboard: 18.0%
6. Python (Programming Language): 17.9%
7. Power BI: 16.7%
8. Microsoft Office: 11.3%
9. Microsoft PowerPoint: 10.7%
10. R (Programming Language): 9.0%
# Salary by Occupation AND Remote Type (multi-factor view)
df_remote = df[df['REMOTE_TYPE_NAME'] != 'Not Specified'].copy()
fig = px.box(
df_remote,
x='LOT_V6_OCCUPATION_NAME',
y='SALARY',
color='REMOTE_TYPE_NAME',
title='Salary by Occupation and Remote Work Type',
labels={
'LOT_V6_OCCUPATION_NAME': '',
'SALARY': 'Annual Salary ($)',
'REMOTE_TYPE_NAME': 'Work Type'
}
)
fig.update_layout(
template='plotly_white',
height=500,
xaxis_tickangle=-15,
legend_title_text='Remote Type'
)
fig.write_image('figures/salary_by_occupation_remote.png', scale=2)
fig.show()
# Summary table
print("\nMedian Salary by Occupation & Remote Type:")
pivot = df_remote.pivot_table(
values='SALARY',
index='LOT_V6_OCCUPATION_NAME',
columns='REMOTE_TYPE_NAME',
aggfunc='median'
).round(0)
print(pivot.applymap(lambda x: f"${x:,.0f}" if pd.notna(x) else "N/A"))
Median Salary by Occupation & Remote Type:
REMOTE_TYPE_NAME Hybrid Remote Not Remote Remote
LOT_V6_OCCUPATION_NAME
Business / Management Analyst $97,250 $97,250 $97,250
Business Intelligence Analyst $125,900 $125,900 $125,900
Data / Data Mining Analyst $95,300 $95,300 $95,300
Market Research Analyst $94,500 N/A $94,500
/tmp/ipykernel_7603/976825870.py:33: FutureWarning:
DataFrame.applymap has been deprecated. Use DataFrame.map instead.